/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>

#include "rds.h"
#include "ib.h"

void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
		          struct rds_ib_send_work *send)
{
	rdsdebug("ic %p send %p rm %p\n", ic, send, send->s_rm);
	dma_unmap_sg(ic->i_cm_id->device->dma_device,
		     send->s_rm->m_sg, send->s_rm->m_nents,
		     DMA_TO_DEVICE);
	rds_message_put(send->s_rm);
	send->s_rm = NULL;
}

/*
 * Fill in the fields of every send ring entry that are set once here:
 * the work request template (wr_id is the entry's ring index, one sge,
 * IB_WR_SEND, no flags) and the second sge, which points at the entry's
 * own slot in the DMA-mapped header array.  The first sge only gets its
 * lkey here; its address and length are set by the transmit paths.
 */
void rds_ib_send_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_send_work *send;
	u32 i;

	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
		send = &ic->i_sends[i];

		/* no message is attached to a fresh entry */
		send->s_rm = NULL;

		/* sge 0: payload, only the key is known up front */
		send->s_sge[0].lkey = ic->i_mr->lkey;

		/* sge 1: header slot i of the mapped header ring */
		send->s_sge[1].lkey = ic->i_mr->lkey;
		send->s_sge[1].length = sizeof(struct rds_header);
		send->s_sge[1].addr = ic->i_send_hdrs_dma +
				      (i * sizeof(struct rds_header));

		/* work request defaults; num_sge 1 means "no header" */
		send->s_wr.wr_id = i;
		send->s_wr.opcode = IB_WR_SEND;
		send->s_wr.send_flags = 0;
		send->s_wr.imm_data = 0;
		send->s_wr.sg_list = send->s_sge;
		send->s_wr.num_sge = 1;
	}
}

/*
 * Walk the whole send ring and release any message that is still attached
 * to an entry, via rds_ib_send_unmap_rm().
 */
void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
{
	u32 i;

	for (i = 0; i < ic->i_send_ring.w_nr; i++) {
		struct rds_ib_send_work *send = &ic->i_sends[i];

		if (!send->s_rm)
			continue;

		rds_ib_send_unmap_rm(ic, send);
	}
}

/*
 * Completion handler for the send completion queue.
 *
 * Notification is requested again first, then the queue is polled one
 * completion at a time until it is empty.  A wr_id of ~0 is not a ring entry
 * and is passed to rds_ib_ack_send_complete().  For any other wr_id, every
 * ring entry from the oldest up to the count reported by
 * rds_ib_ring_completed() is cleaned up (message released, num_sge reset
 * to 1) and then freed in one go.  The first unsuccessful completion status
 * on a connection logs a warning and queues the connection's c_down_w work.
 *
 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
 * operations performed in the send path.  As the sender allocs and
 * potentially unallocs the next free entry in the ring it doesn't alter
 * which is the next to be freed, which is what this is concerned with.
 */
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send;
	struct ib_wc wc;
	u32 oldest;
	u32 completed;
	u32 left;
	int ret;

	rdsdebug("cq %p conn %p\n", cq, conn);
	rds_ib_stats_inc(s_ib_tx_cq_call);

	/* ask for the next notification before draining the queue */
	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (ret) {
		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
	}

	for (;;) {
		if (ib_poll_cq(cq, 1, &wc) <= 0)
			break;

		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
			 be32_to_cpu(wc.imm_data));
		rds_ib_stats_inc(s_ib_tx_cq_event);

		/* ~0 is not a ring index; handled separately */
		if (wc.wr_id == ~0) {
			rds_ib_ack_send_complete(ic);
			continue;
		}

		oldest = rds_ib_ring_oldest(&ic->i_send_ring);
		completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id,
						  oldest);

		/* clean each finished entry, wrapping at the end of the ring */
		send = &ic->i_sends[oldest];
		for (left = completed; left != 0; left--) {
			if (send->s_rm)
				rds_ib_send_unmap_rm(ic, send);
			send->s_wr.num_sge = 1;

			send++;
			if (send == &ic->i_sends[ic->i_send_ring.w_nr])
				send = ic->i_sends;
		}
		rds_ib_ring_free(&ic->i_send_ring, completed);

		/* We expect errors as the qp is drained during shutdown */
		if (wc.status == IB_WC_SUCCESS || ic->i_wc_err)
			continue;

		/* only the first error on the connection is reported */
		ic->i_wc_err = 1;
		printk(KERN_WARNING "RDS/IB: completion on "
		       "%u.%u.%u.%u had status %u, disconnecting and "
		       "reconnecting\n", NIPQUAD(conn->c_faddr),
		       wc.status);
		queue_work(rds_wq, &conn->c_down_w);
	}
}

/*
 * Send the entire congestion bitmap as one chain of work requests.
 *
 * The bitmap pages are DMA-mapped on the first call (while i_map_count is
 * still 0) and that mapping is reused by later calls.  Partial sends are not
 * supported: the send ring must supply enough entries for the whole bitmap
 * or nothing is sent.
 *
 * @conn:   connection to send on; its c_transport_data is the IB state
 * @map:    congestion map whose pages are transmitted
 * @offset: must be 0, anything else hits a BUG_ON
 *
 * Returns sizeof(struct rds_header) + RDS_CONG_MAP_BYTES on success,
 * -ENOMEM when the pages can't be mapped or the ring can't supply enough
 * entries, or the error returned by ib_post_send().
 */
int rds_ib_xmit_cong_map(struct rds_connection *conn,
			 struct rds_cong_map *map, unsigned long offset)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	struct rds_header *hdr;
	unsigned long i;
	unsigned int off;
	u32 pos;
	u32 work_alloc;
	int ret;

	BUG_ON(offset != 0);

	/* map bitmap pages once */
	if (ic->i_map_count == 0) {
		for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
			ic->i_map_sg[i].page =
				virt_to_page(map->m_page_addrs[i]);
			ic->i_map_sg[i].offset = 0;
			ic->i_map_sg[i].length = PAGE_SIZE;
		}
		ic->i_map_count = dma_map_sg(ic->i_cm_id->device->dma_device,
					 ic->i_map_sg,
					 ARRAY_SIZE(ic->i_map_sg),
					 DMA_TO_DEVICE); /* XXX ? */
		if (ic->i_map_count == 0) {
			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
			ret = -ENOMEM;
			goto out;
		}
	}

	/*
	 * Instead of knowing how to return a partial send we insist that there
	 * be enough work requests to send the entire bitmap.
	 */
	i = ceil(RDS_CONG_MAP_BYTES, RDS_FRAG_SIZE);
	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc != i) {
		/*
		 * The ring came up short.  Give back whatever it did hand out
		 * so those entries aren't lost, and only count the ring as
		 * full on this path.
		 */
		if (work_alloc)
			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	send = &ic->i_sends[pos];

	/* build the header and include it in the first wr */
	hdr = &ic->i_send_hdrs[pos];
	memset(hdr, 0, sizeof(struct rds_header));
	hdr->h_flags = RDS_FLAG_CONG_BITMAP;
	hdr->h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
	send->s_wr.num_sge = 2;

	/*
	 * Now iterate through the work reqs, pointing them at the bits of the
	 * bitmap pages that they correspond to.  RDS_FRAG_SIZE has to be less
	 * than sg granularity or the ceil() above would have to be changed.
	 * We've allocated all the work reqs for the bitmaps so we only signal
	 * on the final wr.
	 */
	scat = ic->i_map_sg;
	off = 0;
	first = send;
	prev = NULL;
	for (i = 0; i < work_alloc; i++) {
		send->s_sge[0].addr = sg_dma_address(scat) + off;
		send->s_sge[0].length = min(RDS_FRAG_SIZE,
					    sg_dma_len(scat) - off);
		send->s_wr.send_flags = 0;
		if (prev)
			prev->s_wr.next = &send->s_wr;

		/* step to the next mapped entry once this one is used up */
		off += send->s_sge[0].length;
		if (off == sg_dma_len(scat)) {
			scat++;
			off = 0;
		}

		/* advance through the ring, wrapping at the end */
		prev = send;
		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
			send = ic->i_sends;
	}

	/* terminate the chain and request a completion for the last wr */
	prev->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	prev->s_wr.next = NULL;

	/* XXX need to worry about failed_wr and partial sends. */
	failed_wr = &first->s_wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
		 first, &first->s_wr, ret, failed_wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: ib_post_send to %u.%u.%u.%u "
		       "returned %d\n", NIPQUAD(conn->c_faddr), ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
	} else
		ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
out:
	return ret;
}

/*
 * Queue as much of a message as the send ring currently allows.
 *
 * This can be called multiple times for a given message.  The first time
 * we see a message we map its scatterlist into the IB device so that
 * we can provide that mapped address to the IB scatter gather entries
 * in the IB work requests.  We translate the scatterlist into a series
 * of work requests that fragment the message.  These work requests complete
 * in order so we pass ownership of the message to the completion handler
 * once we send the final fragment.
 *
 * The RDS core uses the c_send_sem to only enter this function once
 * per connection.  This makes sure that the tx ring alloc/unalloc pairs
 * don't get out of sync and confuse the ring.
 *
 * @conn:    connection to send on; its c_transport_data is the IB state
 * @rm:      message being transmitted
 * @hdr_off: 0 if the header still has to go out with this call, otherwise
 *           sizeof(struct rds_header); any other value hits a BUG_ON
 * @sg:      index of the rm->m_sg entry to start from
 * @off:     byte offset into that entry; must be a multiple of RDS_FRAG_SIZE
 *
 * Returns the number of bytes posted (the header is counted when hdr_off
 * is 0), -ENOMEM when the scatterlist can't be mapped or no ring entry is
 * free, or the error returned by ib_post_send().
 */
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
	        unsigned int hdr_off, unsigned int sg, unsigned int off)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_send_work *send = NULL;
	struct rds_ib_send_work *first;
	struct rds_ib_send_work *prev;
	struct ib_send_wr *failed_wr;
	struct scatterlist *scat;
	unsigned long len;
	u32 pos;
	u32 i;
	u32 work_alloc;
	int sent;
	int ret;
	/*
	 * Work requests posted since the last signaled one.
	 * NOTE(review): this is function-static, so it is shared by every
	 * connection; the per-connection c_send_sem mentioned above would not
	 * serialize updates from different connections -- confirm whether
	 * this should live in struct rds_ib_connection.
	 */
	static u32 unsignaled_wrs_count;

	BUG_ON(off % RDS_FRAG_SIZE);
	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));

	/* map the message the first time we see it */
	if (ic->i_rm == NULL && rm->m_nents) {
		rm->m_count = dma_map_sg(ic->i_cm_id->device->dma_device,
					 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
		rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
		unsignaled_wrs_count = 0;
		if (rm->m_count == 0) {
			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
			ret = -ENOMEM; /* XXX ? */
			goto out;
		}
		/*
		 * Hold a reference while the message is in flight; it is
		 * handed to the last work request further down.
		 */
		rds_message_addref(rm);
		ic->i_rm = rm;
	}

	/* one work request per fragment, and one for a 0-len message */
	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
		i = 1;
	else
		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);

	/* the ring may hand out fewer entries than asked for */
	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc == 0) {
		rds_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &rm->m_sg[sg];
	sent = 0;
	i = 0;

	/*
	 * We could be copying the header into the unused tail of the page.
	 * That would need to be changed in the future when those pages might
	 * be mapped userspace pages or page cache pages.  So instead we always
	 * use a second sge and our long-lived ring of mapped headers.  We send
	 * the header after the data so that the data payload can be aligned on
	 * the receiver.
	 */
	if (hdr_off == 0) {
		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
		       sizeof(struct rds_header));
		sent += sizeof(struct rds_header);
		first->s_wr.num_sge = 2;
	}

	/*
	 * handle a 0-len message
	 *
	 * NOTE(review): when rm->m_nents is 0 the mapping branch above is
	 * skipped, so no reference is taken and ic->i_rm is not set for this
	 * message -- confirm that this is intended.
	 */
	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
		first->s_sge[0].addr = 0;
		first->s_sge[0].length = 0;
		first->s_wr.next = NULL;
		/* as though we walked the scatter list */
		i = 1;
		prev = first;
	}

	/* if there's data reference it with a chain of work reqs */
	for(; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {

		++unsignaled_wrs_count;

		/*
		 * We want to delay signaling completions just enough to get
		 * the batching benefits but not so much that we create dead
		 * time on the wire.  A sysctl value of 0 turns the periodic
		 * signaling off.
		 */
		if ( rds_ib_sysctl_max_unsig_wrs > 0 && unsignaled_wrs_count >= rds_ib_sysctl_max_unsig_wrs ) {
			unsignaled_wrs_count = 0;
			send->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		}
		else
			send->s_wr.send_flags = 0;

		/* link this wr onto the end of the chain */
		send->s_wr.next = NULL;
		if (prev)
			prev->s_wr.next = &send->s_wr;

		/* a fragment never crosses a mapped scatterlist entry */
		len = min(RDS_FRAG_SIZE, sg_dma_len(scat) - off);
		send->s_sge[0].addr = sg_dma_address(scat) + off;
		send->s_sge[0].length = len;

		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);

		sent += len;
		off += len;
		if (off == sg_dma_len(scat)) {
			scat++;
			off = 0;
		}

		/* advance through the ring, wrapping at the end */
		prev = send;
		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
			send = ic->i_sends;
	}

	/* if we finished the message then send completion owns it */
	if (scat == &rm->m_sg[rm->m_count]) {
		prev->s_rm = ic->i_rm;
		prev->s_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		ic->i_rm = NULL;
	}

	/* return the ring entries we were given but did not fill in */
	if (i < work_alloc) {
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}

	/* XXX need to worry about failed_wr and partial sends. */
	failed_wr = &first->s_wr;
	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, 
		 first, &first->s_wr, ret, failed_wr); 
	/* a failure part way through the chain is not handled, see XXX */
	BUG_ON(failed_wr != &first->s_wr);
	if (ret) {
		printk(KERN_WARNING "RDS/IB: ib_post_send to %u.%u.%u.%u "
		       "returned %d\n", NIPQUAD(conn->c_faddr), ret);
		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		/* nothing was posted: take the message back from the wr */
		if (prev->s_rm) {
			ic->i_rm = prev->s_rm;
			prev->s_rm = NULL;
		}
		goto out;
	}

	ret = sent;
out:
	return ret;
}
